Start with a general question
Can I automatically distinguish emails that are SPAM from those that are not?
Make it concrete
Can I use quantitative characteristics of the emails to classify them as SPAM/HAM?
http://rss.acs.unt.edu/Rdoc/library/kernlab/html/spam.html
Dear Jeff,
Can you send me your address so I can send you the invitation?
Thanks,
Ben
Dear Jeff,
Can send me your address so I can send the invitation?
Thanks,
Ben
Frequency of you \(= 2/17 = 0.118\)
# Load the kernlab package and its spam dataset
# (4601 emails, 57 word/character frequency features plus a spam/nonspam label)
library(kernlab)
data(spam)
# Inspect the structure of the data frame
str(spam)
## 'data.frame': 4601 obs. of 58 variables:
## $ make : num 0 0.21 0.06 0 0 0 0 0 0.15 0.06 ...
## $ address : num 0.64 0.28 0 0 0 0 0 0 0 0.12 ...
## $ all : num 0.64 0.5 0.71 0 0 0 0 0 0.46 0.77 ...
## $ num3d : num 0 0 0 0 0 0 0 0 0 0 ...
## $ our : num 0.32 0.14 1.23 0.63 0.63 1.85 1.92 1.88 0.61 0.19 ...
## $ over : num 0 0.28 0.19 0 0 0 0 0 0 0.32 ...
## $ remove : num 0 0.21 0.19 0.31 0.31 0 0 0 0.3 0.38 ...
## $ internet : num 0 0.07 0.12 0.63 0.63 1.85 0 1.88 0 0 ...
## $ order : num 0 0 0.64 0.31 0.31 0 0 0 0.92 0.06 ...
## $ mail : num 0 0.94 0.25 0.63 0.63 0 0.64 0 0.76 0 ...
## $ receive : num 0 0.21 0.38 0.31 0.31 0 0.96 0 0.76 0 ...
## $ will : num 0.64 0.79 0.45 0.31 0.31 0 1.28 0 0.92 0.64 ...
## $ people : num 0 0.65 0.12 0.31 0.31 0 0 0 0 0.25 ...
## $ report : num 0 0.21 0 0 0 0 0 0 0 0 ...
## $ addresses : num 0 0.14 1.75 0 0 0 0 0 0 0.12 ...
## $ free : num 0.32 0.14 0.06 0.31 0.31 0 0.96 0 0 0 ...
## $ business : num 0 0.07 0.06 0 0 0 0 0 0 0 ...
## $ email : num 1.29 0.28 1.03 0 0 0 0.32 0 0.15 0.12 ...
## $ you : num 1.93 3.47 1.36 3.18 3.18 0 3.85 0 1.23 1.67 ...
## $ credit : num 0 0 0.32 0 0 0 0 0 3.53 0.06 ...
## $ your : num 0.96 1.59 0.51 0.31 0.31 0 0.64 0 2 0.71 ...
## $ font : num 0 0 0 0 0 0 0 0 0 0 ...
## $ num000 : num 0 0.43 1.16 0 0 0 0 0 0 0.19 ...
## $ money : num 0 0.43 0.06 0 0 0 0 0 0.15 0 ...
## $ hp : num 0 0 0 0 0 0 0 0 0 0 ...
## $ hpl : num 0 0 0 0 0 0 0 0 0 0 ...
## $ george : num 0 0 0 0 0 0 0 0 0 0 ...
## $ num650 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ lab : num 0 0 0 0 0 0 0 0 0 0 ...
## $ labs : num 0 0 0 0 0 0 0 0 0 0 ...
## $ telnet : num 0 0 0 0 0 0 0 0 0 0 ...
## $ num857 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ data : num 0 0 0 0 0 0 0 0 0.15 0 ...
## $ num415 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ num85 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ technology : num 0 0 0 0 0 0 0 0 0 0 ...
## $ num1999 : num 0 0.07 0 0 0 0 0 0 0 0 ...
## $ parts : num 0 0 0 0 0 0 0 0 0 0 ...
## $ pm : num 0 0 0 0 0 0 0 0 0 0 ...
## $ direct : num 0 0 0.06 0 0 0 0 0 0 0 ...
## $ cs : num 0 0 0 0 0 0 0 0 0 0 ...
## $ meeting : num 0 0 0 0 0 0 0 0 0 0 ...
## $ original : num 0 0 0.12 0 0 0 0 0 0.3 0 ...
## $ project : num 0 0 0 0 0 0 0 0 0 0.06 ...
## $ re : num 0 0 0.06 0 0 0 0 0 0 0 ...
## $ edu : num 0 0 0.06 0 0 0 0 0 0 0 ...
## $ table : num 0 0 0 0 0 0 0 0 0 0 ...
## $ conference : num 0 0 0 0 0 0 0 0 0 0 ...
## $ charSemicolon : num 0 0 0.01 0 0 0 0 0 0 0.04 ...
## $ charRoundbracket : num 0 0.132 0.143 0.137 0.135 0.223 0.054 0.206 0.271 0.03 ...
## $ charSquarebracket: num 0 0 0 0 0 0 0 0 0 0 ...
## $ charExclamation : num 0.778 0.372 0.276 0.137 0.135 0 0.164 0 0.181 0.244 ...
## $ charDollar : num 0 0.18 0.184 0 0 0 0.054 0 0.203 0.081 ...
## $ charHash : num 0 0.048 0.01 0 0 0 0 0 0.022 0 ...
## $ capitalAve : num 3.76 5.11 9.82 3.54 3.54 ...
## $ capitalLong : num 61 101 485 40 40 15 4 11 445 43 ...
## $ capitalTotal : num 278 1028 2259 191 191 ...
## $ type : Factor w/ 2 levels "nonspam","spam": 2 2 2 2 2 2 2 2 2 2 ...
# Density of the frequency of the word 'your' in nonspam emails (blue)
# overlaid with the density for spam emails (red)
plot(density(spam$your[spam$type=="nonspam"]),
col="blue",main="",xlab="Frequency of 'your'")
lines(density(spam$your[spam$type=="spam"]),col="red")
Our algorithm
# Same density plot as above, with the chosen cutoff (0.5) drawn as a
# vertical black line
plot(density(spam$your[spam$type=="nonspam"]),
col="blue",main="",xlab="Frequency of 'your'")
lines(density(spam$your[spam$type=="spam"]),col="red")
abline(v=0.5,col="black")
# Classify as spam when the frequency of 'your' exceeds the cutoff,
# then tabulate predictions against the true labels as proportions
prediction <- ifelse(spam$your > 0.5,"spam","nonspam")
head(table(prediction,spam$type)/length(spam$type),10)
##
## prediction nonspam spam
## nonspam 0.4590306 0.1017170
## spam 0.1469246 0.2923278
Accuracy \(\approx 0.459 + 0.292 = 0.751\)
In Sample Error: The error rate you get on the same data set you used to build your predictor. Sometimes called resubstitution error.
Out of Sample Error: The error rate you get on a new data set. Sometimes called generalization error.
Key ideas
library(kernlab); data(spam); set.seed(333)
smallSpam <- spam[sample(dim(spam)[1],size=10),]
spamLabel <- (smallSpam$type=="spam")*1 + 1
plot(smallSpam$capitalAve,col=spamLabel)
Apply Rule 1 to smallSpam
rule1 <- function(x) {
  # Hand-tuned rule that perfectly separates the 10-email training sample
  # by carving capitalAve into four intervals (deliberately overfit).
  labels <- rep(NA, length(x))
  labels[x < 2.40] <- "nonspam"
  labels[x >= 2.40 & x <= 2.45] <- "spam"
  labels[x > 2.45 & x <= 2.70] <- "nonspam"
  labels[x > 2.7] <- "spam"
  labels
}
head(table(rule1(smallSpam$capitalAve),smallSpam$type),10)
##
## nonspam spam
## nonspam 5 0
## spam 0 5
Apply Rule 2 to smallSpam
rule2 <- function(x) {
  # Simpler single-threshold rule: call an email spam when its average
  # capital run length exceeds 2.8, nonspam otherwise.
  labels <- rep("nonspam", length(x))
  labels[x > 2.8] <- "spam"
  labels
}
head(table(rule2(smallSpam$capitalAve),smallSpam$type),10)
##
## nonspam spam
## nonspam 5 1
## spam 0 4
Apply to complete spam data
head(table(rule1(spam$capitalAve),spam$type),5)
##
## nonspam spam
## nonspam 2141 588
## spam 647 1225
head(table(rule2(spam$capitalAve),spam$type),5)
##
## nonspam spam
## nonspam 2224 642
## spam 564 1171
mean(rule1(spam$capitalAve)==spam$type)
## [1] 0.7315801
mean(rule2(spam$capitalAve)==spam$type)
## [1] 0.7378831
sum(rule1(spam$capitalAve)==spam$type)
## [1] 3366
sum(rule2(spam$capitalAve)==spam$type)
## [1] 3395
http://en.wikipedia.org/wiki/Overfitting
In general, Positive = identified and negative = rejected. Therefore:
True positive = correctly identified
False positive = incorrectly identified
True negative = correctly rejected
False negative = incorrectly rejected
Medical testing example:
True positive = Sick people correctly diagnosed as sick
False positive= Healthy people incorrectly identified as sick
True negative = Healthy people correctly identified as healthy
False negative = Sick people incorrectly identified as healthy.
http://en.wikipedia.org/wiki/Sensitivity_and_specificity
http://en.wikipedia.org/wiki/Sensitivity_and_specificity
http://www.biostat.jhsph.edu/~iruczins/teaching/140.615/
Mean squared error (MSE):
\[\frac{1}{n} \sum_{i=1}^n (Prediction_i - Truth_i)^2\]
Root mean squared error (RMSE):
\[\sqrt{\frac{1}{n} \sum_{i=1}^n(Prediction_i - Truth_i)^2}\]
http://en.wikipedia.org/wiki/Receiver_operating_characteristic
Approach:
Use the training set
Split it into training/test sets
Build a model on the training set
Evaluate on the test set
Repeat and average the estimated errors
Used for:
Picking variables to include in a model
Picking the type of prediction function to use
Picking the parameters in the prediction function
Comparing different predictors
library(caret); library(kernlab); data(spam)
inTrain <- createDataPartition(y=spam$type,
p=0.75, list=FALSE)
training1 <- spam[inTrain,]
testing <- spam[-inTrain,]
dim(training1)
## [1] 3451 58
set.seed(32343)
modelFit <- train(type ~.,data=training1, method="glm")
modelFit
## Generalized Linear Model
##
## 3451 samples
## 57 predictors
## 2 classes: 'nonspam', 'spam'
##
## No pre-processing
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 3451, 3451, 3451, 3451, 3451, 3451, ...
## Resampling results:
##
## Accuracy Kappa
## 0.9122402 0.8140946
##
##
modelFit <- train(type ~.,data=training1, method="glm")
modelFit$finalModel
##
## Call: NULL
##
## Coefficients:
## (Intercept) make address
## -1.381e+14 -2.321e+14 -1.302e+14
## all num3d our
## 7.966e+13 7.800e+13 2.235e+14
## over remove internet
## 1.187e+14 3.078e+14 2.496e+14
## order mail receive
## 2.880e+14 -3.152e+13 1.064e+14
## will people report
## -7.044e+13 -2.164e+14 -4.492e+13
## addresses free business
## -1.099e+13 1.871e+14 9.344e+13
## email you credit
## 1.571e+14 4.467e+13 2.627e+13
## your font num000
## 2.329e+14 5.361e+13 3.283e+14
## money hp hpl
## 1.870e+14 -2.275e+14 -1.082e+14
## george num650 lab
## -2.175e+14 1.056e+14 -1.762e+14
## labs telnet num857
## -3.168e+13 -7.696e+14 2.227e+14
## data num415 num85
## -2.003e+14 -4.322e+14 -1.296e+14
## technology num1999 parts
## 2.703e+14 -2.869e+12 -2.241e+14
## pm direct cs
## -1.209e+14 7.148e+13 -3.497e+14
## meeting original project
## -2.839e+14 -3.522e+14 -3.326e+14
## re edu table
## -2.232e+14 -3.060e+14 -4.648e+14
## conference charSemicolon charRoundbracket
## -5.708e+14 -5.086e+14 -1.671e+14
## charSquarebracket charExclamation charDollar
## -1.779e+14 1.882e+14 1.234e+15
## charHash capitalAve capitalLong
## 3.029e+14 4.115e+12 5.607e+11
## capitalTotal
## 2.183e+11
##
## Degrees of Freedom: 3450 Total (i.e. Null); 3393 Residual
## Null Deviance: 4628
## Residual Deviance: 30930 AIC: 31040
predictions <- predict(modelFit,newdata=testing)
head(predictions,10)
## [1] spam spam spam spam nonspam spam spam spam
## [9] spam spam
## Levels: nonspam spam
confusionMatrix(predictions,testing$type)
## Confusion Matrix and Statistics
##
## Reference
## Prediction nonspam spam
## nonspam 555 14
## spam 142 439
##
## Accuracy : 0.8643
## 95% CI : (0.8432, 0.8836)
## No Information Rate : 0.6061
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.7293
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.7963
## Specificity : 0.9691
## Pos Pred Value : 0.9754
## Neg Pred Value : 0.7556
## Prevalence : 0.6061
## Detection Rate : 0.4826
## Detection Prevalence : 0.4948
## Balanced Accuracy : 0.8827
##
## 'Positive' Class : nonspam
##
library(caret); library(kernlab); data(spam)
inTrain <- createDataPartition(y=spam$type,
p=0.75, list=FALSE)
training2 <- spam[inTrain,]
testing <- spam[-inTrain,]
dim(training2)
## [1] 3451 58
set.seed(32323)
folds <- createFolds(y=spam$type,k=10,
list=TRUE,returnTrain=TRUE)
sapply(folds,length)
## Fold01 Fold02 Fold03 Fold04 Fold05 Fold06 Fold07 Fold08 Fold09 Fold10
## 4141 4140 4141 4142 4140 4142 4141 4141 4140 4141
folds[[1]][1:10]
## [1] 1 2 3 4 5 6 7 8 9 10
set.seed(32323)
folds <- createFolds(y=spam$type,k=10,
list=TRUE,returnTrain=FALSE)
sapply(folds,length)
## Fold01 Fold02 Fold03 Fold04 Fold05 Fold06 Fold07 Fold08 Fold09 Fold10
## 460 461 460 459 461 459 460 460 461 460
folds[[1]][1:10]
## [1] 24 27 32 40 41 43 55 58 63 68
set.seed(32323)
folds <- createResample(y=spam$type,times=10,
list=TRUE)
sapply(folds,length)
## Resample01 Resample02 Resample03 Resample04 Resample05 Resample06
## 4601 4601 4601 4601 4601 4601
## Resample07 Resample08 Resample09 Resample10
## 4601 4601 4601 4601
folds[[1]][1:10]
## [1] 1 2 3 3 3 5 5 7 8 12
set.seed(32323)
tme <- 1:1000
folds <- createTimeSlices(y=tme,initialWindow=20,
horizon=10)
names(folds)
## [1] "train" "test"
folds$train[[1]]
## [1] 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
folds$test[[1]]
## [1] 21 22 23 24 25 26 27 28 29 30
library(caret); library(kernlab); data(spam)
inTrain <- createDataPartition(y=spam$type,
p=0.75, list=FALSE)
training3 <- spam[inTrain,]
testing <- spam[-inTrain,]
modelFit <- train(type ~.,data=training3, method="glm")
modelFit
## Generalized Linear Model
##
## 3451 samples
## 57 predictors
## 2 classes: 'nonspam', 'spam'
##
## No pre-processing
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 3451, 3451, 3451, 3451, 3451, 3451, ...
## Resampling results:
##
## Accuracy Kappa
## 0.9167223 0.8245611
##
##
args(train.default)
## function (x, y, method = "rf", preProcess = NULL, ..., weights = NULL,
## metric = ifelse(is.factor(y), "Accuracy", "RMSE"), maximize = ifelse(metric %in%
## c("RMSE", "logLoss"), FALSE, TRUE), trControl = trainControl(),
## tuneGrid = NULL, tuneLength = 3)
## NULL
Continuous outcomes: * RMSE = Root mean squared error * RSquared = \(R^2\) from regression models
Categorical outcomes: * Accuracy = Fraction correct * Kappa = A measure of concordance
args(trainControl)
## function (method = "boot", number = ifelse(grepl("cv", method),
## 10, 25), repeats = ifelse(grepl("cv", method), 1, number),
## p = 0.75, search = "grid", initialWindow = NULL, horizon = 1,
## fixedWindow = TRUE, verboseIter = FALSE, returnData = TRUE,
## returnResamp = "final", savePredictions = FALSE, classProbs = FALSE,
## summaryFunction = defaultSummary, selectionFunction = "best",
## preProcOptions = list(thresh = 0.95, ICAcomp = 3, k = 5),
## sampling = NULL, index = NULL, indexOut = NULL, indexFinal = NULL,
## timingSamps = 0, predictionBounds = rep(FALSE, 2), seeds = NA,
## adaptive = list(min = 5, alpha = 0.05, method = "gls", complete = TRUE),
## trim = FALSE, allowParallel = TRUE)
## NULL
set.seed(1235)
modelFit2 <- train(type ~.,data=training3, method="glm")
modelFit2
## Generalized Linear Model
##
## 3451 samples
## 57 predictors
## 2 classes: 'nonspam', 'spam'
##
## No pre-processing
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 3451, 3451, 3451, 3451, 3451, 3451, ...
## Resampling results:
##
## Accuracy Kappa
## 0.9152739 0.8213541
##
##
set.seed(1235)
modelFit3 <- train(type ~.,data=training3, method="glm")
modelFit3
## Generalized Linear Model
##
## 3451 samples
## 57 predictors
## 2 classes: 'nonspam', 'spam'
##
## No pre-processing
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 3451, 3451, 3451, 3451, 3451, 3451, ...
## Resampling results:
##
## Accuracy Kappa
## 0.9152739 0.8213541
##
##
Image Credit http://www.cahs-media.org/the-high-cost-of-low-wages
Data from: ISLR package from the book: Introduction to statistical learning
library(ISLR)
library(ggplot2)
library(caret)
library(Hmisc)
library(gridExtra)
data(Wage)
summary(Wage)
## year age sex maritl
## Min. :2003 Min. :18.00 1. Male :3000 1. Never Married: 648
## 1st Qu.:2004 1st Qu.:33.75 2. Female: 0 2. Married :2074
## Median :2006 Median :42.00 3. Widowed : 19
## Mean :2006 Mean :42.41 4. Divorced : 204
## 3rd Qu.:2008 3rd Qu.:51.00 5. Separated : 55
## Max. :2009 Max. :80.00
##
## race education region
## 1. White:2480 1. < HS Grad :268 2. Middle Atlantic :3000
## 2. Black: 293 2. HS Grad :971 1. New England : 0
## 3. Asian: 190 3. Some College :650 3. East North Central: 0
## 4. Other: 37 4. College Grad :685 4. West North Central: 0
## 5. Advanced Degree:426 5. South Atlantic : 0
## 6. East South Central: 0
## (Other) : 0
## jobclass health health_ins logwage
## 1. Industrial :1544 1. <=Good : 858 1. Yes:2083 Min. :3.000
## 2. Information:1456 2. >=Very Good:2142 2. No : 917 1st Qu.:4.447
## Median :4.653
## Mean :4.654
## 3rd Qu.:4.857
## Max. :5.763
##
## wage
## Min. : 20.09
## 1st Qu.: 85.38
## Median :104.92
## Mean :111.70
## 3rd Qu.:128.68
## Max. :318.34
##
inTrain <- createDataPartition(y=Wage$wage,
p=0.7, list=FALSE)
training4 <- Wage[inTrain,]
testing <- Wage[-inTrain,]
dim(training4); dim(testing)
## [1] 2102 12
## [1] 898 12
featurePlot(x=training4[,c("age","education","jobclass")],
y = training4$wage,
plot="pairs")
qplot(age,wage,data=training4)
qplot(age,wage,colour=jobclass,data=training4)
qq <- qplot(age,wage,colour=education,data=training4)
qq + geom_smooth(method='lm',formula=y~x)
cutWage <- cut2(training4$wage,g=3)
table(cutWage)
## cutWage
## [ 20.9, 93) [ 93.0,119) [118.9,318]
## 713 715 674
p1 <- qplot(cutWage,age, data=training4,fill=cutWage,
geom=c("boxplot"))
p1
library(gridExtra)
p2 <- qplot(cutWage,age, data=training4,fill=cutWage,
geom=c("boxplot","jitter"))
grid.arrange(p1,p2,ncol=2)
t1 <- table(cutWage,training4$jobclass)
t1
##
## cutWage 1. Industrial 2. Information
## [ 20.9, 93) 446 267
## [ 93.0,119) 358 357
## [118.9,318] 265 409
prop.table(t1,1)
##
## cutWage 1. Industrial 2. Information
## [ 20.9, 93) 0.6255259 0.3744741
## [ 93.0,119) 0.5006993 0.4993007
## [118.9,318] 0.3931751 0.6068249
qplot(wage,colour=education,data=training4,geom="density")
library(caret); library(RANN); library(kernlab); data(spam)
inTrain <- createDataPartition(y=spam$type,
p=0.75, list=FALSE)
training <- spam[inTrain,]
testing <- spam[-inTrain,]
hist(training$capitalAve,main="",xlab="ave. capital run length")
mean(training$capitalAve)
## [1] 4.716994
sd(training$capitalAve)
## [1] 26.82555
trainCapAve <- training$capitalAve
trainCapAveS <- (trainCapAve - mean(trainCapAve))/sd(trainCapAve)
mean(trainCapAveS)
## [1] -6.935532e-18
sd(trainCapAveS)
## [1] 1
testCapAve <- testing$capitalAve
testCapAveS <- (testCapAve - mean(trainCapAve))/sd(trainCapAve)
mean(testCapAveS)
## [1] 0.07077199
sd(testCapAveS)
## [1] 1.610786
preObj <- preProcess(training[,-58],method=c("center","scale"))
trainCapAveS <- predict(preObj,training[,-58])$capitalAve
mean(trainCapAveS)
## [1] -6.935532e-18
sd(trainCapAveS)
## [1] 1
testCapAveS <- predict(preObj,testing[,-58])$capitalAve
mean(testCapAveS)
## [1] 0.07077199
sd(testCapAveS)
## [1] 1.610786
set.seed(32343)
modelFit <- train(type ~.,data=training,
preProcess=c("center","scale"),method="glm")
modelFit
## Generalized Linear Model
##
## 3451 samples
## 57 predictors
## 2 classes: 'nonspam', 'spam'
##
## Pre-processing: centered (57), scaled (57)
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 3451, 3451, 3451, 3451, 3451, 3451, ...
## Resampling results:
##
## Accuracy Kappa
## 0.9208656 0.8332911
##
##
preObj <- preProcess(training[,-58],method=c("BoxCox"))
trainCapAveS <- predict(preObj,training[,-58])$capitalAve
par(mfrow=c(1,2)); hist(trainCapAveS); qqnorm(trainCapAveS)
set.seed(13343)
# Make some values NA
training$capAve <- training$capitalAve
selectNA <- rbinom(dim(training)[1],size=1,prob=0.05)==1
training$capAve[selectNA] <- NA
# Impute and standardize
preObj <- preProcess(training[,-58],method="knnImpute")
capAve <- predict(preObj,training[,-58])$capAve
# Standardize true values
capAveTruth <- training$capitalAve
capAveTruth <- (capAveTruth-mean(capAveTruth))/sd(capAveTruth)
quantile(capAve - capAveTruth)
## 0% 25% 50% 75% 100%
## -1.1884293998 -0.0008836469 0.0005853112 0.0012905270 1.0145601207
quantile((capAve - capAveTruth)[selectNA])
## 0% 25% 50% 75% 100%
## -1.188429400 -0.011925252 0.003908074 0.025463933 1.014560121
quantile((capAve - capAveTruth)[!selectNA])
## 0% 25% 50% 75% 100%
## -0.9757406838 -0.0008010270 0.0005776479 0.0012414810 0.0018085649
Level 1: From raw data to covariate
Level 2: Transforming tidy covariates
library(kernlab);data(spam)
spam$capitalAveSq <- spam$capitalAve^2
library(ISLR); library(caret); data(Wage);
inTrain <- createDataPartition(y=Wage$wage,
p=0.7, list=FALSE)
training <- Wage[inTrain,]; testing <- Wage[-inTrain,]
Basic idea - convert factor variables to indicator variables
table(training$jobclass)
##
## 1. Industrial 2. Information
## 1051 1051
dummies <- dummyVars(wage ~ jobclass,data=training)
head(predict(dummies,newdata=training))
## jobclass.1. Industrial jobclass.2. Information
## 86582 0 1
## 161300 1 0
## 155159 0 1
## 11443 0 1
## 376662 0 1
## 450601 1 0
nsv <- nearZeroVar(training,saveMetrics=TRUE)
nsv
## freqRatio percentUnique zeroVar nzv
## year 1.037356 0.33301618 FALSE FALSE
## age 1.027027 2.85442436 FALSE FALSE
## sex 0.000000 0.04757374 TRUE TRUE
## maritl 3.272931 0.23786870 FALSE FALSE
## race 8.938776 0.19029496 FALSE FALSE
## education 1.389002 0.23786870 FALSE FALSE
## region 0.000000 0.04757374 TRUE TRUE
## jobclass 1.000000 0.09514748 FALSE FALSE
## health 2.468647 0.09514748 FALSE FALSE
## health_ins 2.352472 0.09514748 FALSE FALSE
## logwage 1.061728 19.17221694 FALSE FALSE
## wage 1.061728 19.17221694 FALSE FALSE
library(splines)
bsBasis <- bs(training$age,df=3)
head(bsBasis,10)
## 1 2 3
## [1,] 0.2368501 0.02537679 0.000906314
## [2,] 0.4163380 0.32117502 0.082587862
## [3,] 0.4308138 0.29109043 0.065560908
## [4,] 0.3625256 0.38669397 0.137491189
## [5,] 0.3063341 0.42415495 0.195763821
## [6,] 0.4241549 0.30633413 0.073747105
## [7,] 0.3776308 0.09063140 0.007250512
## [8,] 0.4443582 0.22759810 0.038858212
## [9,] 0.4422183 0.19539878 0.028779665
## [10,] 0.3625256 0.38669397 0.137491189
See also: ns(),poly()
lm1 <- lm(wage ~ bsBasis,data=training)
plot(training$age,training$wage,pch=19,cex=0.5)
points(training$age,predict(lm1,newdata=training),col="red",pch=19,cex=0.5)
head(predict(bsBasis,age=testing$age))
## 1 2 3
## [1,] 0.2368501 0.02537679 0.000906314
## [2,] 0.4163380 0.32117502 0.082587862
## [3,] 0.4308138 0.29109043 0.065560908
## [4,] 0.3625256 0.38669397 0.137491189
## [5,] 0.3063341 0.42415495 0.195763821
## [6,] 0.4241549 0.30633413 0.073747105
\[ X = 0.71 \times {\rm num415} + 0.71 \times {\rm num857}\]
\[ Y = 0.71 \times {\rm num415} - 0.71 \times {\rm num857}\]
X <- 0.71*training$num415 + 0.71*training$num857
Y <- 0.71*training$num415 - 0.71*training$num857
plot(X,Y)
smallSpam <- spam[,c(34,32)]
prComp <- prcomp(smallSpam)
plot(prComp$x[,1],prComp$x[,2])
prComp$rotation
## PC1 PC2
## num415 0.7080625 0.7061498
## num857 0.7061498 -0.7080625
typeColor <- ((spam$type=="spam")*1 + 1)
prComp <- prcomp(log10(spam[,-58]+1))
plot(prComp$x[,1],prComp$x[,2],col=typeColor,xlab="PC1",ylab="PC2")
preProc <- preProcess(log10(spam[,-58]+1),method="pca",pcaComp=2)
spamPC <- predict(preProc,log10(spam[,-58]+1))
plot(spamPC[,1],spamPC[,2],col=typeColor)
preProc <- preProcess(log10(training[,-58]+1),method="pca",pcaComp=2)
trainPC <- predict(preProc,log10(training[,-58]+1))
modelFit <- train(training$type ~ .,method="glm",data=trainPC)
testPC <- predict(preProc,log10(testing[,-58]+1))
confusionMatrix(testing$type,predict(modelFit,testPC))
## Confusion Matrix and Statistics
##
## Reference
## Prediction nonspam spam
## nonspam 645 52
## spam 73 380
##
## Accuracy : 0.8913
## 95% CI : (0.8719, 0.9087)
## No Information Rate : 0.6243
## P-Value [Acc > NIR] : < 2e-16
##
## Kappa : 0.7705
## Mcnemar's Test P-Value : 0.07364
##
## Sensitivity : 0.8983
## Specificity : 0.8796
## Pos Pred Value : 0.9254
## Neg Pred Value : 0.8389
## Prevalence : 0.6243
## Detection Rate : 0.5609
## Detection Prevalence : 0.6061
## Balanced Accuracy : 0.8890
##
## 'Positive' Class : nonspam
##
modelFit <- train(training$type ~ .,method="glm",preProcess="pca",data=training)
confusionMatrix(testing$type,predict(modelFit,testing))
## Confusion Matrix and Statistics
##
## Reference
## Prediction nonspam spam
## nonspam 658 39
## spam 50 403
##
## Accuracy : 0.9226
## 95% CI : (0.9056, 0.9374)
## No Information Rate : 0.6157
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.8372
## Mcnemar's Test P-Value : 0.2891
##
## Sensitivity : 0.9294
## Specificity : 0.9118
## Pos Pred Value : 0.9440
## Neg Pred Value : 0.8896
## Prevalence : 0.6157
## Detection Rate : 0.5722
## Detection Prevalence : 0.6061
## Balanced Accuracy : 0.9206
##
## 'Positive' Class : nonspam
##
Pros: * Easy to implement * Easy to interpret
Cons: * Often poor performance in nonlinear settings
library(caret);data(faithful); set.seed(333)
inTrain <- createDataPartition(y=faithful$waiting,
p=0.5, list=FALSE)
trainFaith <- faithful[inTrain,]; testFaith <- faithful[-inTrain,]
head(trainFaith)
## eruptions waiting
## 1 3.600 79
## 3 3.333 74
## 5 4.533 85
## 6 2.883 55
## 7 4.700 88
## 8 3.600 85
plot(trainFaith$waiting,trainFaith$eruptions,pch=19,col="blue",xlab="Waiting",ylab="Duration")
\[ ED_i = b_0 + b_1 WT_i + e_i \]
lm1 <- lm(eruptions ~ waiting,data=trainFaith)
summary(lm1)
##
## Call:
## lm(formula = eruptions ~ waiting, data = trainFaith)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.26990 -0.34789 0.03979 0.36589 1.05020
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.792739 0.227869 -7.867 1.04e-12 ***
## waiting 0.073901 0.003148 23.474 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.495 on 135 degrees of freedom
## Multiple R-squared: 0.8032, Adjusted R-squared: 0.8018
## F-statistic: 551 on 1 and 135 DF, p-value: < 2.2e-16
plot(trainFaith$waiting,trainFaith$eruptions,pch=19,col="blue",xlab="Waiting",ylab="Duration")
lines(trainFaith$waiting,lm1$fitted,lwd=3)
\[\hat{ED} = \hat{b}_0 + \hat{b}_1 WT\]
coef(lm1)[1] + coef(lm1)[2]*80
## (Intercept)
## 4.119307
newdata <- data.frame(waiting=80)
predict(lm1,newdata)
## 1
## 4.119307
par(mfrow=c(1,2))
plot(trainFaith$waiting,trainFaith$eruptions,pch=19,col="blue",xlab="Waiting",ylab="Duration")
lines(trainFaith$waiting,predict(lm1),lwd=3)
plot(testFaith$waiting,testFaith$eruptions,pch=19,col="blue",xlab="Waiting",ylab="Duration")
lines(testFaith$waiting,predict(lm1,newdata=testFaith),lwd=3)
# Error on the training set: square root of the SUM of squared residuals
# (labelled RMSE in the lecture, but note there is no division by n,
# so this is sqrt(RSS) rather than a true root MEAN squared error)
sqrt(sum((lm1$fitted-trainFaith$eruptions)^2))
## [1] 5.75186
# Error on the test set: square root of the SUM of squared residuals
# (same caveat as above: no division by n, so not a true RMSE).
# It is larger than the training value, illustrating out-of-sample error.
sqrt(sum((predict(lm1,newdata=testFaith)-testFaith$eruptions)^2))
## [1] 5.838559
pred1 <- predict(lm1,newdata=testFaith,interval="prediction")
ord <- order(testFaith$waiting)
plot(testFaith$waiting,testFaith$eruptions,pch=19,col="blue")
matlines(testFaith$waiting[ord],pred1[ord,],type="l",col=c(1,2,2),lty = c(1,1,1), lwd=3)
modFit <- train(eruptions ~ waiting,data=trainFaith,method="lm")
summary(modFit$finalModel)
##
## Call:
## lm(formula = .outcome ~ ., data = dat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.26990 -0.34789 0.03979 0.36589 1.05020
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.792739 0.227869 -7.867 1.04e-12 ***
## waiting 0.073901 0.003148 23.474 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.495 on 135 degrees of freedom
## Multiple R-squared: 0.8032, Adjusted R-squared: 0.8018
## F-statistic: 551 on 1 and 135 DF, p-value: < 2.2e-16
Data from: ISLR package from the book: Introduction to statistical learning
library(ISLR); library(ggplot2); library(caret);
data(Wage); Wage <- subset(Wage,select=-c(logwage))
summary(Wage)
## year age sex maritl
## Min. :2003 Min. :18.00 1. Male :3000 1. Never Married: 648
## 1st Qu.:2004 1st Qu.:33.75 2. Female: 0 2. Married :2074
## Median :2006 Median :42.00 3. Widowed : 19
## Mean :2006 Mean :42.41 4. Divorced : 204
## 3rd Qu.:2008 3rd Qu.:51.00 5. Separated : 55
## Max. :2009 Max. :80.00
##
## race education region
## 1. White:2480 1. < HS Grad :268 2. Middle Atlantic :3000
## 2. Black: 293 2. HS Grad :971 1. New England : 0
## 3. Asian: 190 3. Some College :650 3. East North Central: 0
## 4. Other: 37 4. College Grad :685 4. West North Central: 0
## 5. Advanced Degree:426 5. South Atlantic : 0
## 6. East South Central: 0
## (Other) : 0
## jobclass health health_ins
## 1. Industrial :1544 1. <=Good : 858 1. Yes:2083
## 2. Information:1456 2. >=Very Good:2142 2. No : 917
##
##
##
##
##
## wage
## Min. : 20.09
## 1st Qu.: 85.38
## Median :104.92
## Mean :111.70
## 3rd Qu.:128.68
## Max. :318.34
##
inTrain <- createDataPartition(y=Wage$wage,
p=0.7, list=FALSE)
training <- Wage[inTrain,]; testing <- Wage[-inTrain,]
dim(training); dim(testing)
## [1] 2102 11
## [1] 898 11
featurePlot(x=training[,c("age","education","jobclass")],
y = training$wage,
plot="pairs")
qplot(age,wage,data=training)
qplot(age,wage,colour=jobclass,data=training)
qplot(age,wage,colour=education,data=training)
\[ ED_i = b_0 + b_1 age + b_2 I(Jobclass_i="Information") + \sum_{k=1}^4 \gamma_k I(education_i= level k) \]
modFit<- train(wage ~ age + jobclass + education,
method = "lm",data=training)
finMod <- modFit$finalModel
print(modFit)
## Linear Regression
##
## 2102 samples
## 10 predictors
##
## No pre-processing
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 2102, 2102, 2102, 2102, 2102, 2102, ...
## Resampling results:
##
## RMSE Rsquared
## 36.06666 0.2517055
##
##
Education levels: 1 = HS Grad, 2 = Some College, 3 = College Grad, 4 = Advanced Degree
plot(finMod,1,pch=19,cex=0.5,col="#00000010")
qplot(finMod$fitted,finMod$residuals,colour=race,data=training)
plot(finMod$residuals,pch=19)
pred <- predict(modFit, testing)
qplot(wage,pred,colour=year,data=testing)
modFitAll<- train(wage ~ .,data=training,method="lm")
pred <- predict(modFitAll, testing)
qplot(wage,pred,data=testing)
Pros:
Cons:
\[\hat{p}_{mk} = \frac{1}{N_m}\sum_{x_i \in {\rm Leaf}\; m}\mathbb{1}(y_i = k)\]
Misclassification Error: \[ 1 - \hat{p}_{m k(m)}; \quad k(m) = {\rm most\; common\; k}\] * 0 = perfect purity * 0.5 = no purity
Gini index: \[ \sum_{k \neq k'} \hat{p}_{mk} \times \hat{p}_{mk'} = \sum_{k=1}^K \hat{p}_{mk}(1-\hat{p}_{mk}) = 1 - \sum_{k=1}^K \hat{p}_{mk}^2\]
http://en.wikipedia.org/wiki/Decision_tree_learning
Deviance/information gain:
\[ -\sum_{k=1}^K \hat{p}_{mk} \log_2\hat{p}_{mk} \] * 0 = perfect purity * 1 = no purity
http://en.wikipedia.org/wiki/Decision_tree_learning
— &twocol w1:50% w2:50%
*** =left
*** =right
data(iris); library(ggplot2)
names(iris)
## [1] "Sepal.Length" "Sepal.Width" "Petal.Length" "Petal.Width"
## [5] "Species"
table(iris$Species)
##
## setosa versicolor virginica
## 50 50 50
library(caret)
inTrain <- createDataPartition(y=iris$Species,
p=0.7, list=FALSE)
training <- iris[inTrain,]
testing <- iris[-inTrain,]
dim(training); dim(testing)
## [1] 105 5
## [1] 45 5
library(ggplot2)
qplot(Petal.Width,Sepal.Width,colour=Species,data=training)
library(caret)
modFit <- train(Species ~ .,method="rpart",data=training)
print(modFit$finalModel)
## n= 105
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 105 70 setosa (0.33333333 0.33333333 0.33333333)
## 2) Petal.Length< 2.45 35 0 setosa (1.00000000 0.00000000 0.00000000) *
## 3) Petal.Length>=2.45 70 35 versicolor (0.00000000 0.50000000 0.50000000)
## 6) Petal.Width< 1.65 34 1 versicolor (0.00000000 0.97058824 0.02941176) *
## 7) Petal.Width>=1.65 36 2 virginica (0.00000000 0.05555556 0.94444444) *
plot(modFit$finalModel, uniform=TRUE,
main="Classification Tree")
text(modFit$finalModel, use.n=TRUE, all=TRUE, cex=.8)
library(rattle)
## Rattle: A free graphical interface for data mining with R.
## Version 4.1.0 Copyright (c) 2006-2015 Togaware Pty Ltd.
## Type 'rattle()' to shake, rattle, and roll your data.
fancyRpartPlot(modFit$finalModel)
predict(modFit,newdata=testing)
## [1] setosa setosa setosa setosa setosa setosa
## [7] setosa setosa setosa setosa setosa setosa
## [13] setosa setosa setosa versicolor versicolor versicolor
## [19] versicolor versicolor versicolor versicolor versicolor versicolor
## [25] versicolor versicolor versicolor versicolor versicolor versicolor
## [31] virginica virginica virginica virginica virginica virginica
## [37] versicolor virginica virginica versicolor versicolor virginica
## [43] virginica virginica virginica
## Levels: setosa versicolor virginica
Basic idea:
Notes:
library(ElemStatLearn); data(ozone,package="ElemStatLearn")
##
## Attaching package: 'ElemStatLearn'
## The following object is masked _by_ '.GlobalEnv':
##
## spam
ozone <- ozone[order(ozone$ozone),]
head(ozone)
## ozone radiation temperature wind
## 17 1 8 59 9.7
## 19 4 25 61 9.7
## 14 6 78 57 18.4
## 45 7 48 80 14.3
## 106 7 49 69 10.3
## 7 8 19 61 20.1
http://en.wikipedia.org/wiki/Bootstrap_aggregating
# Bagged loess: fit a loess smoother to 10 bootstrap resamples of the
# ozone data, storing each curve's predictions on the grid 1:155.
ll <- matrix(NA, nrow = 10, ncol = 155)
for (i in 1:10) {
  # Bootstrap resample the rows (with replacement), then re-sort by ozone
  ss <- sample(seq_len(nrow(ozone)), replace = TRUE)
  ozone0 <- ozone[ss, ]
  ozone0 <- ozone0[order(ozone0$ozone), ]
  loess0 <- loess(temperature ~ ozone, data = ozone0, span = 0.2)
  ll[i, ] <- predict(loess0, newdata = data.frame(ozone = 1:155))
}
# Plot the data, each bootstrap fit in grey, and the bagged
# (column-averaged) fit in red
plot(ozone$ozone, ozone$temperature, pch = 19, cex = 0.5)
for (i in 1:10) {
  lines(1:155, ll[i, ], col = "grey", lwd = 2)
}
lines(1:155, colMeans(ll), col = "red", lwd = 2)
In the train function, consider the method options bagEarth, treebag, and bagFDA; alternatively, use the bag function in library(caret)
predictors = data.frame(ozone=ozone$ozone)
temperature = ozone$temperature
treebag <- bag(predictors, temperature, B = 10,
bagControl = bagControl(fit = ctreeBag$fit,
predict = ctreeBag$pred,
aggregate = ctreeBag$aggregate))
## Warning: executing %dopar% sequentially: no parallel backend registered
http://www.inside-r.org/packages/cran/caret/docs/nbBag
# Grey: observed data; red: predictions of a single bagged tree;
# blue: the aggregated (bagged) prediction, which is smoother.
plot(ozone$ozone,temperature,col='lightgrey',pch=19)
points(ozone$ozone,predict(treebag$fits[[1]]$fit,predictors),pch=19,col="red")
points(ozone$ozone,predict(treebag,predictors),pch=19,col="blue")
# The fitting function: trains one conditional-inference tree
# (party::ctree) on a bootstrap resample.
ctreeBag$fit
## function (x, y, ...)
## {
## loadNamespace("party")
## data <- as.data.frame(x)
## data$y <- y
## party::ctree(y ~ ., data = data)
## }
## <environment: namespace:caret>
# The prediction function: returns a data frame of class probabilities when
# the response is a factor, otherwise the raw treeresponse output.
ctreeBag$pred
## function (object, x)
## {
## if (!is.data.frame(x))
## x <- as.data.frame(x)
## obsLevels <- levels(object@data@get("response")[, 1])
## if (!is.null(obsLevels)) {
## rawProbs <- party::treeresponse(object, x)
## probMatrix <- matrix(unlist(rawProbs), ncol = length(obsLevels),
## byrow = TRUE)
## out <- data.frame(probMatrix)
## colnames(out) <- obsLevels
## rownames(out) <- NULL
## }
## else out <- unlist(party::treeresponse(object, x))
## out
## }
## <environment: namespace:caret>
# The aggregation function: takes the median across the B fits -- the
# per-class median probability for classification, the median prediction
# for regression.
ctreeBag$aggregate
## function (x, type = "class")
## {
## if (is.matrix(x[[1]]) | is.data.frame(x[[1]])) {
## pooled <- x[[1]] & NA
## classes <- colnames(pooled)
## for (i in 1:ncol(pooled)) {
## tmp <- lapply(x, function(y, col) y[, col], col = i)
## tmp <- do.call("rbind", tmp)
## pooled[, i] <- apply(tmp, 2, median)
## }
## if (type == "class") {
## out <- factor(classes[apply(pooled, 1, which.max)],
## levels = classes)
## }
## else out <- as.data.frame(pooled)
## }
## else {
## x <- matrix(unlist(x), ncol = length(x))
## out <- apply(x, 1, median)
## }
## out
## }
## <environment: namespace:caret>
Notes:
Further resources:
Pros:
Cons:
# Random forests on iris: stratified 70/30 train/test split by Species.
data(iris); library(ggplot2); library(caret)
inTrain <- createDataPartition(y=iris$Species,
p=0.7, list=FALSE)
training <- iris[inTrain,]
testing <- iris[-inTrain,]
library(caret)
library(randomForest)
# prox=TRUE stores the proximity matrix, needed by classCenter() below.
modFit <- train(Species~ .,data=training,method="rf",prox=TRUE)
modFit
## Random Forest
##
## 105 samples
## 4 predictors
## 3 classes: 'setosa', 'versicolor', 'virginica'
##
## No pre-processing
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 105, 105, 105, 105, 105, 105, ...
## Resampling results across tuning parameters:
##
## mtry Accuracy Kappa
## 2 0.9377826 0.9058122
## 3 0.9378352 0.9058753
## 4 0.9327083 0.8981613
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 3.
library(randomForest)
# Inspect tree k = 2 of the fitted forest. Each row is a node: "split var"
# and "split point" describe the split, status -1 marks a terminal node, and
# "prediction" gives the class (as an integer level) at terminal nodes.
getTree(modFit$finalModel,k=2)
## left daughter right daughter split var split point status prediction
## 1 2 3 4 0.80 1 0
## 2 0 0 0 0.00 -1 1
## 3 4 5 4 1.75 1 0
## 4 6 7 3 5.05 1 0
## 5 8 9 3 5.00 1 0
## 6 0 0 0 0.00 -1 2
## 7 0 0 0 0.00 -1 3
## 8 10 11 1 5.95 1 0
## 9 0 0 0 0.00 -1 3
## 10 0 0 0 0.00 -1 2
## 11 0 0 0 0.00 -1 3
# Class "centers" in (Petal.Length, Petal.Width) space, computed from the
# random-forest proximity matrix, overlaid as X marks on the training data.
irisP <- classCenter(training[,c(3,4)], training$Species, modFit$finalModel$prox)
irisP <- as.data.frame(irisP); irisP$Species <- rownames(irisP)
p <- qplot(Petal.Width, Petal.Length, col=Species,data=training)
p + geom_point(aes(x=Petal.Width,y=Petal.Length,col=Species),size=5,shape=4,data=irisP)
# Predict on the held-out test set and tabulate predictions against truth.
pred <- predict(modFit,testing); testing$predRight <- pred==testing$Species
table(pred,testing$Species)
##
## pred setosa versicolor virginica
## setosa 15 0 0
## versicolor 0 15 0
## virginica 0 0 15
# Highlight which test points were classified correctly.
qplot(Petal.Width,Petal.Length,colour=predRight,data=testing,main="newdata Predictions")
Notes:
Further resources:
http://webee.technion.ac.il/people/rmeir/BoostingTutorial.pdf
Pros:
Cons:
Our goal is to build a parametric model for the conditional distribution \(P(Y = k | X = x)\)
A typical approach is to apply Bayes theorem: \[ Pr(Y = k | X=x) = \frac{Pr(X=x|Y=k)Pr(Y=k)}{\sum_{\ell=1}^K Pr(X=x |Y = \ell) Pr(Y=\ell)}\] \[Pr(Y = k | X=x) = \frac{f_k(x) \pi_k}{\sum_{\ell = 1}^K f_{\ell}(x) \pi_{\ell}}\]
Typically prior probabilities \(\pi_k\) are set in advance.
A common choice is \(f_k(x) = \frac{1}{\sigma_k \sqrt{2 \pi}}e^{-\frac{(x-\mu_k)^2}{2\sigma_k^2}}\), a Gaussian distribution
Estimate the parameters (\(\mu_k\),\(\sigma_k^2\)) from the data.
Classify to the class with the highest value of \(P(Y = k | X = x)\)
A range of models use this approach
http://statweb.stanford.edu/~tibs/ElemStatLearn/
\[\log \frac{Pr(Y = k | X=x)}{Pr(Y = j | X=x)}\] \[ = \log \frac{f_k(x)}{f_j(x)} + \log \frac{\pi_k}{\pi_j}\] \[ = \log \frac{\pi_k}{\pi_j} - \frac{1}{2}(\mu_k + \mu_j)^T \Sigma^{-1}(\mu_k - \mu_j)\] \[ + x^T \Sigma^{-1} (\mu_k - \mu_j)\]
http://statweb.stanford.edu/~tibs/ElemStatLearn/
\[\delta_k(x) = x^T \Sigma^{-1} \mu_k - \frac{1}{2}\mu_k^T \Sigma^{-1}\mu_k + \log(\pi_k)\]
Suppose we have many predictors, we would want to model: \(P(Y = k | X_1,\ldots,X_m)\)
We could use Bayes Theorem to get:
\[P(Y = k | X_1,\ldots,X_m) = \frac{\pi_k P(X_1,\ldots,X_m| Y=k)}{\sum_{\ell = 1}^K P(X_1,\ldots,X_m | Y=\ell) \pi_{\ell}}\] \[ \propto \pi_k P(X_1,\ldots,X_m| Y=k)\]
This can be written:
\[P(X_1,\ldots,X_m, Y=k) = \pi_k P(X_1 | Y = k)P(X_2,\ldots,X_m | X_1,Y=k)\] \[ = \pi_k P(X_1 | Y = k) P(X_2 | X_1, Y=k) P(X_3,\ldots,X_m | X_1,X_2, Y=k)\] \[ = \pi_k P(X_1 | Y = k) P(X_2 | X_1, Y=k)\ldots P(X_m|X_1\ldots,X_{m-1},Y=k)\]
We could make an assumption to write this:
\[ \approx \pi_k P(X_1 | Y = k) P(X_2 | Y = k)\ldots P(X_m | Y=k)\]
# Model-based prediction on iris: compare linear discriminant analysis (lda)
# with naive Bayes (nb) on a stratified 70/30 split.
data(iris); library(ggplot2)
names(iris)
## [1] "Sepal.Length" "Sepal.Width" "Petal.Length" "Petal.Width"
## [5] "Species"
table(iris$Species)
##
## setosa versicolor virginica
## 50 50 50
library(caret)
inTrain <- createDataPartition(y=iris$Species,
p=0.7, list=FALSE)
training <- iris[inTrain,]
testing <- iris[-inTrain,]
dim(training); dim(testing)
## [1] 105 5
## [1] 45 5
library(klaR); library(MASS)
## Loading required package: MASS
modlda = train(Species ~ .,data=training,method="lda")
modnb = train(Species ~ ., data=training,method="nb")
plda = predict(modlda,testing); pnb = predict(modnb,testing)
# The two classifiers agree on all but one test case.
table(plda,pnb)
## pnb
## plda setosa versicolor virginica
## setosa 15 0 0
## versicolor 0 15 0
## virginica 0 1 14
# Plot the test set, colouring the single point where lda and nb disagree.
equalPredictions = (plda==pnb)
qplot(Petal.Width,Sepal.Width,colour=equalPredictions,data=testing)
Pros:
Cons:
\[Y = \beta_0 + \beta_1 X_1 + \beta_2 X_2 + \epsilon\]
where \(X_1\) and \(X_2\) are nearly perfectly correlated (co-linear). You can approximate this model by:
\[Y = \beta_0 + (\beta_1 + \beta_2)X_1 + \epsilon\]
The result is:
# Prostate cancer data: 97 men, clinical predictors, log PSA (lpsa) response;
# used below to illustrate regularized regression and model selection.
library(ElemStatLearn); data(prostate)
str(prostate)
## 'data.frame': 97 obs. of 10 variables:
## $ lcavol : num -0.58 -0.994 -0.511 -1.204 0.751 ...
## $ lweight: num 2.77 3.32 2.69 3.28 3.43 ...
## $ age : int 50 58 74 58 62 50 64 58 47 63 ...
## $ lbph : num -1.39 -1.39 -1.39 -1.39 -1.39 ...
## $ svi : int 0 0 0 0 0 0 0 0 0 0 ...
## $ lcp : num -1.39 -1.39 -1.39 -1.39 -1.39 ...
## $ gleason: int 6 6 7 6 6 6 6 6 6 6 ...
## $ pgg45 : int 0 0 20 0 0 0 0 0 0 0 ...
## $ lpsa : num -0.431 -0.163 -0.163 -0.163 0.372 ...
## $ train : logi TRUE TRUE TRUE TRUE TRUE TRUE ...
No method better when data/computation time permits it
http://www.biostat.jhsph.edu/~ririzarr/Teaching/649/ http://www.cbcb.umd.edu/~hcorrada/PracticalML/
Assume \(Y_i = f(X_i) + \epsilon_i\)
\(EPE(\lambda) = E\left[\{Y - \hat{f}_{\lambda}(X)\}^2\right]\)
Suppose \(\hat{f}_{\lambda}\) is the estimate from the training data and look at a new data point \(X = x^*\)
\[E\left[\{Y - \hat{f}_{\lambda}(x^*)\}^2\right] = \sigma^2 + \{E[\hat{f}_{\lambda}(x^*)] - f(x^*)\}^2 + var[\hat{f}_\lambda(x^*)]\]
http://www.biostat.jhsph.edu/~ririzarr/Teaching/649/ http://www.cbcb.umd.edu/~hcorrada/PracticalML/
# With only 5 observations and more parameters than data points the design
# matrix is rank-deficient: lm() returns NA for coefficients it cannot
# estimate -- illustrating why high-dimensional fits need regularization.
small = prostate[1:5,]
lm(lpsa ~ .,data =small)
##
## Call:
## lm(formula = lpsa ~ ., data = small)
##
## Coefficients:
## (Intercept) lcavol lweight age lbph
## 9.60615 0.13901 -0.79142 0.09516 NA
## svi lcp gleason pgg45 trainTRUE
## NA NA -2.08710 NA NA
http://www.biostat.jhsph.edu/~ririzarr/Teaching/649/ http://www.cbcb.umd.edu/~hcorrada/PracticalML/
Model \(Y = f(X) + \epsilon\)
Set \(\hat{f}_{\lambda}(x) = x'\beta\)
Constrain only \(\lambda\) coefficients to be nonzero.
The selection problem is, after choosing \(\lambda\), to figure out which \(p - \lambda\) coefficients to make nonzero
http://www.biostat.jhsph.edu/~ririzarr/Teaching/649/ http://www.cbcb.umd.edu/~hcorrada/PracticalML/
If the \(\beta_j\)’s are unconstrained: * They can explode * And hence are susceptible to very high variance
To control variance, we might regularize/shrink the coefficients.
\[ PRSS(\beta) = \sum_{j=1}^n (Y_j - \sum_{i=1}^m \beta_{1i} X_{ij})^2 + P(\lambda; \beta)\]
where \(PRSS\) is a penalized form of the sum of squares. Things that are commonly looked for
Solve:
\[ \sum_{i=1}^N \left(y_i - \beta_0 - \sum_{j=1}^p x_{ij}\beta_j \right)^2 + \lambda \sum_{j=1}^p \beta_j^2\]
equivalent to solving
\(\sum_{i=1}^N \left(y_i - \beta_0 - \sum_{j=1}^p x_{ij}\beta_j \right)^2\) subject to \(\sum_{j=1}^p \beta_j^2 \leq s\) where \(s\) is inversely proportional to \(\lambda\)
Inclusion of \(\lambda\) makes the problem non-singular even if \(X^TX\) is not invertible.
http://www.biostat.jhsph.edu/~ririzarr/Teaching/649/ http://www.cbcb.umd.edu/~hcorrada/PracticalML/
http://www.biostat.jhsph.edu/~ririzarr/Teaching/649/ http://www.cbcb.umd.edu/~hcorrada/PracticalML/
\(\sum_{i=1}^N \left(y_i - \beta_0 - \sum_{j=1}^p x_{ij}\beta_j \right)^2\) subject to \(\sum_{j=1}^p |\beta_j| \leq s\)
also has a lagrangian form
\[ \sum_{i=1}^N \left(y_i - \beta_0 - \sum_{j=1}^p x_{ij}\beta_j \right)^2 + \lambda \sum_{j=1}^p |\beta_j|\]
For orthonormal design matrices (not the norm!) this has a closed form solution
\[\hat{\beta}_j = sign(\hat{\beta}_j^0)(|\hat{\beta}_j^0| - \gamma)^{+}\]
but not in general.
http://www.biostat.jhsph.edu/~ririzarr/Teaching/649/ http://www.cbcb.umd.edu/~hcorrada/PracticalML/
In caret the relevant methods are: `ridge`, `lasso`, `relaxo`.
Suppose we have 5 completely independent classifiers
If accuracy is 70% for each: * \(10\times(0.7)^3(0.3)^2 + 5\times(0.7)^4(0.3) + (0.7)^5\) * 83.7% majority vote accuracy
With 101 independent classifiers * 99.9% majority vote accuracy
Create training, test and validation sets
library(ISLR); data(Wage); library(ggplot2); library(caret);
# Drop logwage -- it is a transform of the response and would leak it.
Wage <- subset(Wage,select=-c(logwage))
# Create a building data set and validation set
inBuild <- createDataPartition(y=Wage$wage,
p=0.7, list=FALSE)
validation <- Wage[-inBuild,]; buildData <- Wage[inBuild,]
# Split the building data again into training and testing sets.
inTrain <- createDataPartition(y=buildData$wage,
p=0.7, list=FALSE)
training <- buildData[inTrain,]; testing <- buildData[-inTrain,]
Create training, test and validation sets
# Resulting sizes: 1474 train + 628 test + 898 validation = 3000 rows total.
dim(training)
## [1] 1474 11
dim(testing)
## [1] 628 11
dim(validation)
## [1] 898 11
# Fit two base learners on the training set:
#   mod1 -- generalized linear model
#   mod2 -- random forest with 3-fold cross-validation
# BUG FIX: `number = 3` belongs inside trainControl(). In the original it
# was passed to train()'s `...` where it does not set the number of CV
# folds, so the trainControl default was silently used instead.
mod1 <- train(wage ~ ., method = "glm", data = training)
mod2 <- train(wage ~ ., method = "rf",
              data = training,
              trControl = trainControl(method = "cv", number = 3))
# Compare the two models' test-set predictions, coloured by observed wage.
pred1 <- predict(mod1, testing)
pred2 <- predict(mod2, testing)
qplot(pred1, pred2, colour = wage, data = testing)
# Stack the two models: fit a GAM that predicts wage from (pred1, pred2).
predDF <- data.frame(pred1,pred2,wage=testing$wage)
combModFit <- train(wage ~.,method="gam",data=predDF)
combPred <- predict(combModFit,predDF)
# Root-sum-of-squares error on the test set for each base model and the
# stacked model -- the combination has the lowest test-set error.
sqrt(sum((pred1-testing$wage)^2))
## [1] 880.3408
sqrt(sum((pred2-testing$wage)^2))
## [1] 916.7846
sqrt(sum((combPred-testing$wage)^2))
## [1] 844.436
# Evaluate on the held-out validation set: the stacked model was tuned on
# the test set, so only the validation set gives an honest error estimate.
pred1V <- predict(mod1,validation); pred2V <- predict(mod2,validation)
predVDF <- data.frame(pred1=pred1V,pred2=pred2V)
combPredV <- predict(combModFit,predVDF)
sqrt(sum((pred1V-validation$wage)^2))
## [1] 981.3656
sqrt(sum((pred2V-validation$wage)^2))
## [1] 1019.503
sqrt(sum((combPredV-validation$wage)^2))
## [1] 987.868
# Unsupervised prediction: split iris 70/30, ignore the labels, cluster the
# training predictors with k-means, and treat clusters as pseudo-labels.
data(iris); library(ggplot2); library(caret)
inTrain <- createDataPartition(y=iris$Species,
p=0.7, list=FALSE)
training <- iris[inTrain,]
testing <- iris[-inTrain,]
dim(training); dim(testing)
## [1] 105 5
## [1] 45 5
# k-means with 3 centers on the four numeric predictors (Species excluded).
kMeans1 <- kmeans(subset(training,select=-c(Species)),centers=3)
training$clusters <- as.factor(kMeans1$cluster)
qplot(Petal.Width,Petal.Length,colour=clusters,data=training)
# The clusters line up with the species, though not perfectly.
table(kMeans1$cluster,training$Species)
##
## setosa versicolor virginica
## 1 0 1 26
## 2 0 34 9
## 3 35 0 0
# Fit a classification tree that predicts the cluster labels from the
# predictors, so new observations can be assigned to a cluster.
modFit <- train(clusters ~.,data=subset(training,select=-c(Species)),method="rpart")
table(predict(modFit,training),training$Species)
##
## setosa versicolor virginica
## 1 0 0 24
## 2 0 35 11
## 3 35 0 0
# Apply the cluster-prediction model to the held-out test set.
testClusterPred <- predict(modFit,testing)
table(testClusterPred ,testing$Species)
##
## testClusterPred setosa versicolor virginica
## 1 0 0 10
## 2 0 15 5
## 3 15 0 0
http://www.google.com/trends/correlate
http://www.newscientist.com/blogs/onepercent/2011/05/google-correlate-passes-our-we.html
# Forecasting example: download daily Apple (AAPL) prices with quantmod.
library(quantmod); library(forecast)
## Loading required package: xts
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
## Loading required package: TTR
## Version 0.4-0 included new data defaults. See ?getSymbols.
##
## Attaching package: 'quantmod'
## The following object is masked from 'package:Hmisc':
##
## Lag
## Loading required package: timeDate
## This is forecast 7.2
##
## Attaching package: 'forecast'
## The following object is masked from 'package:nlme':
##
## getResponse
# Date range: 2008-01-01 through 2013-12-31.
from.dat <- as.Date("01/01/08", format="%m/%d/%y")
to.dat <- as.Date("12/31/13", format="%m/%d/%y")
# NOTE(review): the Google Finance source has since been discontinued in
# quantmod; to re-run this today use src = "yahoo" -- verify before rerunning.
# getSymbols auto-assigns the result to an object named "AAPL".
getSymbols("AAPL", src="google", from = from.dat, to = to.dat)
## As of 0.4-0, 'getSymbols' uses env=parent.frame() and
## auto.assign=TRUE by default.
##
## This behavior will be phased out in 0.5-0 when the call will
## default to use auto.assign=FALSE. getOption("getSymbols.env") and
## getOptions("getSymbols.auto.assign") are now checked for alternate defaults
##
## This message is shown once per session and may be disabled by setting
## options("getSymbols.warning4.0"=FALSE). See ?getSymbols for more details.
## [1] "AAPL"
head(AAPL)
## AAPL.Open AAPL.High AAPL.Low AAPL.Close AAPL.Volume
## 2008-01-02 28.47 28.61 27.51 27.83 269794140
## 2008-01-03 27.92 28.20 27.53 27.85 210516460
## 2008-01-04 27.35 27.57 25.56 25.72 363888854
## 2008-01-07 25.89 26.23 24.32 25.38 518047922
## 2008-01-08 25.73 26.07 24.40 24.46 380953888
## 2008-01-09 24.50 25.60 24.00 25.60 453884711
library(xts); library(quantmod)
# Aggregate the daily AAPL series to monthly OHLC bars.
mAAPL <- to.monthly(AAPL)
# FIX: the series is AAPL, not GOOG -- the original named this `googOpen`
# and labelled the axis "GOOG"; renamed and relabelled for correctness.
aaplOpen <- Op(mAAPL)
# Monthly opening prices as a ts with 12 observations per year; the time
# axis counts years since the start, hence "Years+1".
ts1 <- ts(aaplOpen,frequency=12)
plot(ts1,xlab="Years+1", ylab="AAPL")
https://www.otexts.org/fpp/6/1
# Decompose the monthly series into trend, seasonal and irregular parts.
plot(decompose(ts1),xlab="Years+1")
# Train on years 1-5; test window runs from year 5 to just short of year 7
# (the -0.01 keeps the endpoint inside the observed range).
ts1Train <- window(ts1,start=1,end=5)
ts1Test <- window(ts1,start=5,end=(7-0.01))
## Warning in window.default(x, ...): 'end' value not changed
ts1Train
## Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec
## 1 28.47 19.46 17.78 20.90 24.99 26.94 23.46 22.84 24.63 15.99 15.13 13.04
## 2 12.27 12.73 12.59 14.87 17.97 19.50 20.50 23.60 24.00 26.48 27.11 28.89
## 3 30.49 27.48 29.39 33.57 37.69 37.10 36.33 37.21 35.35 40.88 43.17 45.04
## 4 46.52 48.76 50.78 50.16 49.96 49.84 47.99 56.83 55.12 54.34 56.77 54.65
## 5 58.49
\[ Y_{t}=\frac{1}{2k+1}\sum_{j=-k}^k {y_{t+j}}\]
library(forecast)
# Overlay a centered moving average (order 3) on the training series.
plot(ts1Train)
lines(ma(ts1Train,order=3),col="red")
Example - simple exponential smoothing \[\hat{y}_{t+1} = \alpha y_t + (1-\alpha)\hat{y}_{t}\]
https://www.otexts.org/fpp/7/6
# Exponential smoothing state-space model; "MMM" = multiplicative error,
# trend and seasonality. Forecast forward and overlay the held-out test
# series in red.
ets1 <- ets(ts1Train,model="MMM")
fcast <- forecast(ets1)
plot(fcast); lines(ts1Test,col="red")
# Accuracy measures on the training and test periods -- the test-set error
# is much larger than the training-set error, as expected.
accuracy(fcast,ts1Test)
## ME RMSE MAE MPE MAPE MASE
## Training set -0.4686736 3.052083 2.258242 -1.976207 8.194949 0.1724703
## Test set 0.5203495 19.517108 18.175196 -2.233385 24.308895 1.3881069
## ACF1 Theil's U
## Training set -0.00786701 NA
## Test set 0.92914866 3.282191